#############################################################################
# Gene expression analysis of expression data from human bone
# marrow hematopoietic stem cells
# Organism Homo sapiens
# Dataset: GSE32719 (http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE32719)
# Pang WW, Price EA, Sahoo D, Beerman I et al. Human bone marrow hematopoietic stem cells
# are increased in frequency and myeloid-biased with age.
# Proc Natl Acad Sci U S A 2011 Dec 13;108(50):20012-7.
# PMID: 22123971 (http://www.ncbi.nlm.nih.gov/pubmed/22123971)
# R code: (unknown) double-blind review restriction
##############################################################################
##load required packages
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(biomaRt)
##load the expression data previously was extracted using GEOquery package from hsc_analyses_EG-07022020 code in the processing directory and saved to GSE32719.expression.matrix.csv
GSEdata=read.csv("GSE32719.expression.matrix.csv",stringsAsFactors = FALSE)
geneIDs=GSEdata$X
conditions=colnames(GSEdata[,-1])
# Check the loaded dataset
dim(GSEdata) # Dimension of the dataset
## [1] 54675 28
head(GSEdata) # First few rows
## X GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993
## 1 DDR1_1 7.884408 7.808434 7.487010 7.569611 7.250681 6.944877
## 2 RFC2_2 6.737823 6.437587 7.769109 7.368188 7.472075 8.483271
## 3 HSPA6_3 6.627552 6.682880 5.085364 5.515489 5.265894 5.621940
## 4 PAX8_4 7.415271 7.402765 7.685612 7.740810 7.665285 8.014437
## 5 GUCA1A_5 3.338770 3.644936 3.384022 3.611830 3.598129 3.245417
## 6 UBA7_6 9.916951 9.048261 9.833937 9.473971 9.261590 9.694111
## GSM812994 GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000
## 1 7.421968 7.358658 7.173638 8.099701 7.894993 8.200512 8.199732
## 2 7.205830 7.752333 7.177863 6.904935 6.502177 7.506968 6.700267
## 3 5.774978 5.288154 5.317504 5.049701 5.321981 5.164705 5.256353
## 4 7.620498 8.117782 8.476150 7.328249 7.622319 7.614827 7.248452
## 5 4.010995 3.238467 3.533082 3.156613 3.127926 3.292835 3.491960
## 6 9.276572 10.254284 10.658883 9.638729 9.365365 9.479343 9.994390
## GSM813001 GSM813002 GSM813003 GSM813004 GSM813005 GSM813006 GSM813007
## 1 7.791689 7.813888 7.901430 8.086425 8.290824 8.142938 5.958269
## 2 6.659768 7.013816 6.993406 6.393265 6.812322 6.579866 6.696320
## 3 5.296228 5.288170 5.520958 5.237739 5.193563 5.107821 4.900028
## 4 7.639622 7.522083 7.213168 7.333272 7.491317 7.482816 6.986802
## 5 3.582134 3.352848 3.469692 3.323915 3.343692 3.475047 3.496106
## 6 9.323403 9.699203 8.737581 9.942752 9.923929 9.534929 9.174486
## GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013 GSM813014
## 1 7.752040 7.550791 7.621439 8.045653 8.009282 7.710599 7.814352
## 2 6.325255 6.480209 6.863048 6.416551 6.996908 6.809659 6.830948
## 3 5.382177 5.681456 5.188808 5.028203 5.184249 5.060479 4.818644
## 4 7.967097 7.621571 7.347287 7.573728 7.203639 7.520831 7.523141
## 5 4.571782 3.561773 3.322275 3.405920 3.632146 3.295618 3.587407
## 6 9.848087 9.833564 10.101484 9.965800 10.024517 10.030649 9.750748
tail(GSEdata) # Last few rows
## X GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993
## 54670 NA_54670 13.770716 13.994500 11.288592 10.313899 13.538016 14.485988
## 54671 NA_54671 9.891545 10.054197 5.365113 5.982599 9.886868 11.924218
## 54672 NA_54672 12.321272 12.614288 8.244337 8.323074 12.091622 13.694000
## 54673 NA_54673 3.276871 3.351414 3.280466 3.272042 3.395136 3.671908
## 54674 NA_54674 4.238403 3.646006 3.704059 3.981665 4.032586 4.677945
## 54675 NA_54675 4.163933 3.836323 3.706053 4.101131 3.665141 4.500671
## GSM812994 GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000
## 54670 14.520052 14.178241 14.078583 14.001334 13.973219 10.529092 9.162595
## 54671 11.868060 10.719133 10.790617 9.724162 9.221820 5.167606 4.645668
## 54672 13.714543 13.069249 12.974032 12.364099 12.008661 7.537253 6.374678
## 54673 3.585744 3.355240 3.767454 3.162290 3.132417 3.306447 3.208896
## 54674 4.093954 4.360997 4.115608 3.697179 3.816241 3.781706 3.673603
## 54675 4.624900 4.104248 4.173879 3.601409 3.598923 3.662857 3.511473
## GSM813001 GSM813002 GSM813003 GSM813004 GSM813005 GSM813006 GSM813007
## 54670 11.017625 13.004120 13.714042 9.474458 9.719739 7.909547 13.485361
## 54671 6.761740 8.422473 10.464575 5.240622 4.334311 4.103851 9.764244
## 54672 7.622667 10.646550 12.351903 6.504358 6.635874 4.999693 11.984164
## 54673 3.450305 3.204290 3.283685 3.243439 3.126628 3.201200 3.421008
## 54674 3.352568 3.690452 3.890352 3.700332 3.783954 3.731791 3.547028
## 54675 3.675848 3.764469 3.912864 3.677868 3.552473 3.737098 3.860874
## GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013 GSM813014
## 54670 13.542793 14.371288 14.101520 10.178728 12.701168 12.501800 11.059357
## 54671 9.354944 11.776899 10.798681 6.686006 7.583291 7.336464 6.463792
## 54672 12.126887 13.544123 12.895119 7.408262 10.641604 10.386612 8.562743
## 54673 3.545974 3.524391 3.546822 3.339191 3.466229 3.175700 3.244836
## 54674 3.698521 3.860318 4.038958 3.949230 3.748598 3.557370 3.849345
## 54675 3.745716 4.233577 3.806837 3.704331 3.631327 3.511816 3.504921
raw_data=data.matrix(GSEdata,rownames.force = NA)
## Warning in data.matrix(GSEdata, rownames.force = NA): NAs introduced by coercion
rownames(raw_data)=geneIDs
#write.csv(raw_data,"raw_data.csv")
data1 <- raw_data[,-1]
geneSymbols=gsub("_[0-9]*$", "", rownames(data1))
rownames(data1)=geneSymbols
data<-data1[!rownames(data1) %in% "NA", ]
###################
# Exploratory plots
###################
# Check the behavior of the data
#pdf("plots/histogramRawData.pdf")
hist(data, col = "gray", main="GSE32719 - Histogram")

#dev.off()
# Log2 transformation (why?)
data2 = log2(data)
# Check the behavior of the data after log-transformation
#pdf("plots/Log2histogramRawData.pdf")
hist(data2, col = "gray", main="GSE32719 (log2) - Histogram")

#dev.off()
# Boxplot
#pdf("plots\boxPlotRawData.pdf")
boxplot(data2, col=c("darkgreen", "darkgreen", "darkgreen","darkgreen","darkgreen",
"darkgreen", "darkgreen", "darkgreen","darkgreen","darkgreen",
"darkgreen", "darkgreen","darkgreen","darkred",
"darkred", "darkred", "darkred","darkred","blue", "blue", "blue","blue", "blue", "blue","blue", "blue","blue"),
main="GSE32719 - boxplots", las=2,cex.axis=0.6)

#dev.off()
# Hierarchical clustering of the "samples" based on
# the correlation coefficients of the expression values
hc = hclust(as.dist(1-cor(data2)))
#pdf("plots/HclustRawData.pdf")
plot(hc, main="GSE32719 - Hierarchical Clustering")

#dev.off()
#######################################
# Differential expression (DE) analysis
#######################################
# Separate each of the conditions into three smaller data frames
young = data2[,1:13]
middle = data2[,14:18]
old = data2[,19:26]
YOdata=cbind(young,old)
YMdata=cbind(young,middle)
MOdata=cbind(middle,old)
# Compute the means of the samples of each condition
young.mean = apply(young, 1, mean)
middle.mean = apply(middle, 1, mean)
old.mean = apply(old,1, mean)
head(young.mean)
## DDR1 RFC2 HSPA6 PAX8 GUCA1A UBA7
## 2.931289 2.850180 2.463136 2.941439 1.777652 3.274266
head(middle.mean)
## DDR1 RFC2 HSPA6 PAX8 GUCA1A UBA7
## 2.995422 2.759273 2.407664 2.894989 1.771068 3.250103
head(old.mean)
## DDR1 RFC2 HSPA6 PAX8 GUCA1A UBA7
## 2.919804 2.731692 2.374850 2.898774 1.838715 3.294212
# Just get the maximum of all the means
limit = max(young.mean, middle.mean,old.mean)
# Scatter plots of young vs old and young vs middle aged
#pdf("plots/comparisonConditions.pdf")
par(mfrow=c(2,3))
plot(young.mean ~ old.mean, xlab = "young", ylab = "old",
main = "GSE32719 - Scatter", xlim = c(0, limit), ylim = c(0, limit))
# Diagonal line
abline(0, 1, col = "red")
######compare young and middle##############
plot(young.mean ~ middle.mean, xlab = "young", ylab = "middle",
#main = "GSE32719 - Scatter",
xlim = c(0, limit), ylim = c(0, limit))
# Diagonal line
abline(0, 1, col = "blue")
##########compare middle and old aged patients
plot(middle.mean ~ old.mean, xlab = "middle", ylab = "old",
#main = "GSE32719 - Scatter",
xlim = c(0, limit), ylim = c(0, limit))
# Diagonal line
abline(0, 1, col = "green")
# Compute fold-change (biological significance)
# Difference between the means of the conditions
#par(mfrow=c(1,3))
foldYO = old.mean - young.mean
# Histogram of the fold differences
hist(foldYO, col = "red")
## between M vs Y ####
foldYM = middle.mean - young.mean
# Histogram of the fold differences
hist(foldYM, col = "blue")
## between O vs M ####
foldMO = old.mean - middle.mean
# Histogram of the fold differences
hist(foldMO, col = "green")

#dev.off()
# Compute statistical significance (using t-test)
pvalueYO = NULL # Empty list for the p-values
tstatYO = NULL # Empty list of the t test statistics
pvalueYM = NULL # Empty list for the p-values
tstatYM = NULL # Empty list of the t test statistics
pvalueMO = NULL # Empty list for the p-values
tstatMO = NULL # Empty list of the t test statistics
for(i in 1 : nrow(data)) { # For each gene :
Y = young[i,] # WT of gene number i
M = middle[i,] # KO of gene number i
O = old[i,]
# Compute t-test between the bi- conditions
tYO = t.test(Y, O)
tYM = t.test(Y, M)
tMO = t.test(M, O)
# Put the current p-value in the pvalues list
pvalueYO[i] = tYO$p.value
# Put the current t-statistic in the tstats list
tstatYO[i] = tYO$statistic
# Put the current p-value in the pvalues list
pvalueYM[i] = tYM$p.value
# Put the current t-statistic in the tstats list
tstatYM[i] = tYM$statistic
# Put the current p-value in the pvalues list
pvalueMO[i] = tMO$p.value
# Put the current t-statistic in the tstats list
tstatMO[i] = tMO$statistic
}
head(pvalueYO)
## [1] 0.840425708 0.004623247 0.056844064 0.128518348 0.332272408 0.427825292
head(pvalueYM)
## [1] 0.03335770 0.03992604 0.18210211 0.06058612 0.85200750 0.56131090
head(pvalueMO)
## [1] 0.1963906 0.3939766 0.2645733 0.8814252 0.2668504 0.2989935
#pdf("plots/pvalueByConditionHist.pdf")
par(mfrow=c(3,1))
# Histogram of p-values (-log10)
hist(-log10(pvalueYO), col = "red")
hist(-log10(pvalueYM), col = "blue")
hist(-log10(pvalueMO), col = "green")

#dev.off()
# Volcano: put the biological significance (fold-change)
# and statistical significance (p-value) in one plot
#pdf("plots/foldYO_volcano_I.pdf")
plot(foldYO, -log10(pvalueYO), main = "GSE32719 YO conditions - Volcano")
#dev.off()
fold_cutoff_YO = log2(1.2)
pvalue_cutoff_YO = 0.01
# Screen for the genes that satisfy the filtering criteria
# Fold-change filter for "biological" significance
filter_by_fold_YO = abs(foldYO) >= fold_cutoff_YO
dim(YOdata[filter_by_fold_YO, ])
## [1] 237 21
# P-value filter for "statistical" significance
filter_by_pvalue_YO = pvalueYO <= pvalue_cutoff_YO
dim(YOdata[filter_by_pvalue_YO, ])
## [1] 649 21
# Combined filter (both biological and statistical)
filter_combined_YO = filter_by_fold_YO & filter_by_pvalue_YO
filtered_YO = YOdata[filter_combined_YO,]
dim(filtered_YO)
## [1] 103 21
head(filtered_YO)
## GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993 GSM812994
## STMN1 2.199005 2.227024 2.688487 2.748846 2.607545 2.045028 2.579443
## NKX2-3 2.356841 2.173925 2.531269 2.381756 2.696276 2.915205 2.018846
## ZNF483 2.340337 2.076402 2.843332 2.709023 2.576160 2.355085 2.506713
## CNNM2 2.215005 2.143364 2.134958 2.524655 1.997714 2.223072 2.278998
## SLC1A6 1.872153 1.887452 2.409534 2.291756 2.795033 2.906294 2.105433
## CDC42-IT1 2.705048 2.828832 2.478477 2.625209 2.773800 2.557121 2.362751
## GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000 GSM813006
## STMN1 2.762775 2.727239 2.527133 2.589342 2.619882 2.557248 2.263959
## NKX2-3 2.666570 2.837494 2.648280 2.715775 2.448178 2.681903 2.897596
## ZNF483 2.623641 2.501987 2.764130 2.921756 2.885695 2.783556 2.987007
## CNNM2 2.747677 1.894273 2.097847 1.953121 2.283668 2.307014 2.095508
## SLC1A6 2.962646 2.632502 2.638485 2.720436 2.419948 1.806488 2.576278
## CDC42-IT1 2.032072 2.272670 2.813177 2.717340 2.374196 2.941816 3.077937
## GSM813007 GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013
## STMN1 1.941420 1.861400 2.124118 1.971108 2.220143 2.143900 2.132806
## NKX2-3 2.880764 2.704343 3.232080 3.145029 2.778223 2.776166 3.048084
## ZNF483 2.895563 3.146203 2.644492 3.182856 3.036737 3.004917 3.015023
## CNNM2 1.930292 1.734418 1.925211 1.871978 1.987162 2.009395 1.976887
## SLC1A6 2.980219 2.485324 2.832418 2.893697 2.850576 3.027672 2.905034
## CDC42-IT1 2.733769 2.983628 2.645847 3.080920 3.141138 2.947566 2.698981
# Let's generate the volcano plot again,
# highlighting the significantly differential expressed genes
#pdf("plots/foldYO_volcanoYO)_II.pdf")
plot(foldYO, -log10(pvalueYO), main = "GSE32719 YO conditions - Volcano #2")
points (foldYO[filter_combined_YO], -log10(pvalueYO[filter_combined_YO]),
pch = 16, col = "red")
#dev.off()
# Highlighting up-regulated in red and down-regulated in blue
#pdf("plots/foldYO_volcanoYO_III.pdf")
plot(foldYO, -log10(pvalueYO), main = "GSE32719 YO conditions - Volcano #3")
points (foldYO[filter_combined_YO & foldYO < 0],
-log10(pvalueYO[filter_combined_YO & foldYO < 0]),
pch = 16, col = "blue")
points (foldYO[filter_combined_YO & foldYO > 0],
-log10(pvalueYO[filter_combined_YO & foldYO > 0]),
pch = 16, col = "red")
legend("topleft",c("NO","up", "down"),cex=.8,
col=c("black","red","blue"),pch=c(21,19,19),title="Genes")

#dev.off()
upRegulated_DEGs_YO = foldYO[filter_combined_YO & foldYO > 0]
length(upRegulated_DEGs_YO)
## [1] 70
downRegulated_DEGs_YO = foldYO[filter_combined_YO & foldYO < 0]
length(downRegulated_DEGs_YO)
## [1] 33
#write.csv(names(upRegulated_DEGs_YO), "up_YO.csv")
#write.csv(names(downRegulated_DEGs_YO), "down_YO.csv")
# Cluster the rows (genes) & columns (samples) by correlation
rowvYO = as.dendrogram(hclust(as.dist(1-cor(t(filtered_YO)))))
colvYO = as.dendrogram(hclust(as.dist(1-cor(filtered_YO))))
#pdf("plots/sampleHeatmap_YO.pdf")
heatmap.2(filtered_YO, Rowv=rowvYO,Colv=colvYO,
col = rev(greenred(20*10)), cexRow=0.8,
cexCol=0.8,scale="row",notecol="black",
margins=c(5,5), # ("margin.Y", "margin.X")
trace='none',
symkey=FALSE,
symbreaks=FALSE,
dendrogram='none',
density.info='histogram',
denscol="black",
keysize=1,
#( "bottom.margin", "left.margin", "top.margin", "left.margin" )
key.par=list(mar=c(3.5,0,3,0)))

# lmat -- added 2 lattice sections (5 and 6) for padding
#lmat=rbind(c(3, 2, 1), c(6, 1, 3)), lhei=c(2,3), lwid=c(1, 3, 1))
#library(gplots)
#dev.off()
saveRDS(filtered_YO,"youngOld.RDS")
# Save the DE genes to a text file
#write.csv(filtered_YO, "GSE32719_YO_DE.csv") #sep = "\t",
#quote = FALSE)
#############################################################################
########## YM conditions ################
# Volcano: put the biological significance (fold-change)
# and statistical significance (p-value) in one plot
#pdf("plots/foldYM_volcanoYM_I.pdf")
plot(foldYM, -log10(pvalueYM), main = "GSE32719 YM conditions - Volcano")

#dev.off()
fold_cutoff_YM = log2(1.2)
pvalue_cutoff_YM = 0.01
# Screen for the genes that satisfy the filtering criteria
# Fold-change filter for "biological" significance
filter_by_fold_YM = abs(foldYM) >= fold_cutoff_YM
dim(YMdata[filter_by_fold_YM, ])
## [1] 723 18
# P-value filter for "statistical" significance
filter_by_pvalue_YM = pvalueYM <= pvalue_cutoff_YM
dim(YMdata[filter_by_pvalue_YM, ])
## [1] 1699 18
# Combined filter (both biological and statistical)
filter_combined_YM = filter_by_fold_YM & filter_by_pvalue_YM
filtered_YM = YMdata[filter_combined_YM,]
dim(filtered_YM)
## [1] 291 18
head(filtered_YM)
## GSM812988 GSM812989 GSM812990 GSM812991 GSM812992 GSM812993 GSM812994
## ARL11 2.253317 2.360007 1.928407 1.612269 1.630158 1.704614 1.507577
## KLHDC1 2.754379 2.578578 2.943292 2.836876 2.621134 2.539873 2.139900
## TMX3 1.997022 2.112761 2.467787 2.056554 2.374235 1.942973 1.770114
## EFCAB13 1.740540 1.754988 1.734457 1.856537 2.055267 1.966144 1.768582
## ZNF483 2.340337 2.076402 2.843332 2.709023 2.576160 2.355085 2.506713
## ABCD3 2.578995 2.460471 2.595698 2.634902 2.260064 2.306108 2.136450
## GSM812995 GSM812996 GSM812997 GSM812998 GSM812999 GSM813000 GSM813001
## ARL11 1.557133 1.739600 2.482342 2.352407 2.511399 2.817176 2.739934
## KLHDC1 2.468662 2.258023 3.212565 2.992187 3.079403 2.987093 3.035587
## TMX3 2.081982 1.855680 2.000743 2.301467 2.552403 1.978050 2.336742
## EFCAB13 1.794697 1.792174 2.278635 2.176460 1.766549 1.812122 2.205928
## ZNF483 2.623641 2.501987 2.764130 2.921756 2.885695 2.783556 2.791046
## ABCD3 2.841588 2.531437 2.167222 2.322367 2.620507 2.355732 2.001314
## GSM813002 GSM813003 GSM813004 GSM813005
## ARL11 2.598942 2.460305 2.842819 2.730768
## KLHDC1 3.051546 2.973761 2.950723 3.166616
## TMX3 2.349820 2.283255 2.543769 2.466596
## EFCAB13 2.425393 2.323305 2.179338 2.163854
## ZNF483 2.911935 2.853945 3.008140 3.019280
## ABCD3 2.282001 2.253844 2.159918 2.218047
# Let's generate the volcano plot again,
# highlighting the significantly differential expressed genes
#pdf("plots/foldYM_volcanoYM_II.pdf")
plot(foldYM, -log10(pvalueYM), main = "GSE32719 YM conditions - Volcano #2")
points (foldYM[filter_combined_YM], -log10(pvalueYM[filter_combined_YM]),
pch = 16, col = "red")

#dev.off()
# Highlighting up-regulated in red and down-regulated in blue
#pdf("plots/foldYM_volcanoYM_III.pdf")
plot(foldYM, -log10(pvalueYM), main = "GSE32719 YM conditions - Volcano #3")
points (foldYM[filter_combined_YM & foldYM < 0],
-log10(pvalueYM[filter_combined_YM & foldYM < 0]),
pch = 16, col = "blue")
points (foldYM[filter_combined_YM & foldYM > 0],
-log10(pvalueYM[filter_combined_YM & foldYM > 0]),
pch = 16, col = "red")
legend("topleft",c("NO","down", "up"),cex=.8,
col=c("black","blue","red"),pch=c(21,19,19),title="Genes")

#dev.off()
upRegulated_DEGs_YM = foldYM[filter_combined_YM & foldYM > 0]
length(upRegulated_DEGs_YM)
## [1] 232
downRegulated_DEGs_YM = foldYM[filter_combined_YM & foldYM < 0]
length(downRegulated_DEGs_YM)
## [1] 59
# Cluster the rows (genes) & columns (samples) by correlation
rowvYM = as.dendrogram(hclust(as.dist(1-cor(t(filtered_YM)))))
colvYM = as.dendrogram(hclust(as.dist(1-cor(filtered_YM))))
#Generate a heatmap
#pdf("plots/heatmap_filtered_YM.pdf")
heatmap(filtered_YM, Rowv=rowvYM, Colv=colvYM, cexCol=0.7)

#dev.off()
#pdf("plots/sampleHeatmap_YM.pdf")
heatmap.2(filtered_YM, Rowv=rowvYM,Colv=colvYM,
col = rev(greenred(20*10)),
cexCol=0.8,scale="row",notecol="black",
margins=c(5,5), # ("margin.Y", "margin.X")
trace='none',
symkey=FALSE,
symbreaks=FALSE,
dendrogram='none',
density.info='histogram',
denscol="black",
keysize=1,
key.par=list(mar=c(3.5,0,3,0)))

#dev.off()
heatmap.2(filtered_YM, Rowv=rowvYM, Colv=colvYM, cexCol=0.7,
col = rev(redblue(256)), scale = "row")

#dev.off()
saveRDS(filtered_YM,"youngMiddle.RDS")
# Save the DE genes to a text file
#write.csv (filtered_YM, "GSE32719_YM_DE.csv")#, sep = "\t",
#quote = FALSE)
#write.csv(names(upRegulated_DEGs_YM), "up_YM.csv")
#write.csv(names(downRegulated_DEGs_YM), "down_YM.csv")
#############################################################################
########## MO conditions ################
# Volcano: put the biological significance (fold-change)
# and statistical significance (p-value) in one plot
#pdf("plots/foldMO_volcanoMO_I.pdf")
plot(foldMO, -log10(pvalueMO), main = "GSE32719 MO conditions - Volcano")

#dev.off()
fold_cutoff_MO = log2(1.2)
pvalue_cutoff_MO = 0.01
# Screen for the genes that satisfy the filtering criteria
# Fold-change filter for "biological" significance
filter_by_fold_MO = abs(foldMO) >= fold_cutoff_MO
dim(MOdata[filter_by_fold_MO, ])
## [1] 540 13
# P-value filter for "statistical" significance
filter_by_pvalue_MO = pvalueMO <= pvalue_cutoff_MO
dim(MOdata[filter_by_pvalue_MO, ])
## [1] 311 13
# Combined filter (both biological and statistical)
filter_combined_MO = filter_by_fold_MO & filter_by_pvalue_MO
filtered_MO = MOdata[filter_combined_MO,]
dim(filtered_MO)
## [1] 64 13
head(filtered_MO)
## GSM813001 GSM813002 GSM813003 GSM813004 GSM813005 GSM813006 GSM813007
## STMN1 2.447901 2.315305 2.383504 2.430237 2.347003 2.263959 1.941420
## RPL32P3 2.948676 2.691774 2.759191 2.909474 2.873412 2.828554 2.401848
## PCBP1-AS1 3.064795 2.795942 2.740716 2.743369 2.811181 2.833281 2.453565
## IRAK3 2.783460 2.936768 2.640736 2.913213 2.739505 2.555179 2.525516
## HECTD1 2.754573 2.816688 2.733798 2.831142 2.653255 2.669166 2.197340
## AHR 2.867518 3.080271 3.115500 3.116647 3.204473 3.179465 2.762232
## GSM813008 GSM813009 GSM813010 GSM813011 GSM813012 GSM813013
## STMN1 1.861400 2.124118 1.971108 2.220143 2.143900 2.132806
## RPL32P3 2.347996 2.361257 2.505445 2.875283 2.601851 2.579124
## PCBP1-AS1 2.333555 2.082120 2.329325 2.664067 2.254540 2.585855
## IRAK3 2.549802 2.147624 2.278666 2.598872 2.611929 2.794008
## HECTD1 2.151519 2.267669 2.540697 2.688946 2.622837 2.646879
## AHR 2.362739 2.596570 2.673025 2.751306 2.790130 2.748587
# Let's generate the volcano plot again,
# highlighting the significantly differential expressed genes
#pdf("plots/foldMO_volcanoMO_II.pdf")
plot(foldMO, -log10(pvalueMO), main = "GSE32719 MO conditions - Volcano #2")
points (foldMO[filter_combined_MO], -log10(pvalueMO[filter_combined_MO]),
pch = 16, col = "red")

#dev.off()
# Highlighting up-regulated in red and down-regulated in blue
#pdf("plots/foldMO_volcanoMO_III.pdf")
plot(foldMO, -log10(pvalueMO), main = "GSE32719 MO conditions - Volcano #3")
points (foldMO[filter_combined_MO & foldMO < 0],
-log10(pvalueMO[filter_combined_MO & foldMO < 0]),
pch = 16, col = "blue")
points (foldMO[filter_combined_MO & foldMO > 0],
-log10(pvalueMO[filter_combined_MO & foldMO > 0]),
pch = 16, col = "red")
legend("topleft",c("NO","down", "up"),cex=.8,
col=c("black","blue","red"),pch=c(21,19,19),title="Genes")

#dev.off()
upRegulated_DEGs_MO = foldMO[filter_combined_MO & foldMO > 0]
length(upRegulated_DEGs_MO)
## [1] 25
downRegulated_DEGs_MO = foldMO[filter_combined_MO & foldMO < 0]
length(downRegulated_DEGs_MO)
## [1] 39
# Cluster the rows (genes) & columns (samples) by correlation
rowvMO = as.dendrogram(hclust(as.dist(1-cor(t(filtered_MO)))))
colvMO = as.dendrogram(hclust(as.dist(1-cor(filtered_MO))))
# Generate a heatmap
#pdf("plots/heatmap_filtered_MO.pdf")
heatmap(filtered_MO, Rowv=rowvMO, Colv=colvMO, cexCol=0.7)

#dev.off()
#pdf("plots/sampleHeatmap_MO.pdf")
heatmap.2(filtered_MO, Rowv=rowvMO,Colv=colvMO,
col = rev(greenred(20*10)),
cexCol=0.8,scale="row",notecol="black",
margins=c(5,5), # ("margin.Y", "margin.X")
trace='none',
symkey=FALSE,
symbreaks=FALSE,
dendrogram='none',
density.info='histogram',
denscol="black",
keysize=1,
key.par=list(mar=c(3.5,0,3,0)))

#dev.off()
# Enhanced heatmap
#pdf("plots/heatmap_filtered_MO.pdf")
heatmap.2(filtered_MO, Rowv=rowvMO, Colv=colvMO, cexCol=0.7,
col = rev(redblue(256)), scale = "row")

#dev.off()
saveRDS(filtered_MO,"middleOld.RDS")
# Save the DE genes to a text file
#write.csv (filtered_MO, "GSE32719_MO_DE.csv")#, sep = "\t",
#quote = FALSE)
#write.csv(names(upRegulated_DEGs_MO),"up_MO.csv")
#write.csv(names(downRegulated_DEGs_MO),"down_MO.csv")
#############################################################################